The purpose of this document is to get oriented with the first RNA-Seq data to come out of the Klotho project and with using Synapse. In this workflow we:
rm(list = ls())
library(here)
#expression.type = "transcript"
expression.type = "gene"
min.term.size = 10 #minimum number of genes allowed in a term
fdr.thresh = 0.1 #fdr for differentially expressed genes
p.thresh = 0.05 #nominal p value to look for effects
args <- commandArgs(trailingOnly=T)
subgroup <- args[2]
if(is.na(subgroup)){
#subgroup <- "all ages"
subgroup <- list("age_batch" = 12)
#subgroup <- list("age_batch" = 4)
#subgroup <- list("age_batch" = 4, "sex_ge" = "Female") #example with more than one filter
}
if(subgroup == "all ages"){
results.dir <- here("Results", "all")
}else{
subgroup.results <- paste(sapply(1:length(subgroup),
function(x) paste(names(subgroup)[x], subgroup[[x]], sep = "_")),
collapse = "_")
results.dir <- here("Results", subgroup.results)
}
if(!file.exists(results.dir)){
dir.create(results.dir, TRUE)
}
general.data.dir <- here("Data", "general")
remove.qc <- TRUE #if 1a_Klotho_QC.Rmd has been run, and animals were identified for removal
#set this to TRUE
ordered.geno <- c("FC/FC", "WT/FC", "WT/WT", "WT/VS", "VS/VS")
The subgroup analyzed in this workflow is: age_batch = 12
all.fun <- list.files(here("Code"), full.names = TRUE)
for(i in 1:length(all.fun)){source(all.fun[i])}
processed.data.dir <- file.path(results.dir, "processed_data")
if(!file.exists(processed.data.dir)){dir.create(processed.data.dir, recursive = TRUE)}
general.processed.data.dir <- here("Results", "Processed_Data")
if(!file.exists(general.processed.data.dir)){dir.create(general.processed.data.dir)}
geno.cols <- get_geno_cols(here("Data", "general", "geno_cols.csv"))
needed.libraries <- c("synapser", "pheatmap", "DESeq2", "DT", "vioplot", "RColorBrewer",
"gprofiler2", "cluster", "pathview", "clusterProfiler", "stringr", "igraph", "wordcloud",
"wordcloud2")
load_libraries(needed.libraries)
kl.ensembl <- "ENSMUSG00000058488"
#gene information tables
mouse.gene.table <- read.table(file.path(general.data.dir, "mouse_gene_info.txt"), sep = "\t", header = TRUE)
orthos <- read.table(file.path(general.data.dir, "human.mouse.orthologs.txt"), sep = "\t", header = TRUE)
data_id <- read.table(here("Data", "Synspse_IDs_Klotho.txt")) #made by hand by copying and pasting file names and IDs from Synapse web page
mouse.data.dir <- here("Data", "Mouse")
if(!file.exists(mouse.data.dir)){dir.create(mouse.data.dir)}
file.dest <- file.path(mouse.data.dir, data_id[,1])
need.to.download <- any(!file.exists(file.dest))
if(need.to.download){
synLogin()
logged.in <- TRUE
for(i in 1:length(file.dest)){
if(!file.exists(file.destp[i])){
synGet(data_id[i,2], downloadLocation = mouse.data.dir)
}
}
}
mouse.info <- read.csv(file.path(mouse.data.dir, "metadata_validated.csv"))
orthos <- read.delim(file.path(general.data.dir, "human.mouse.orthologs.txt"))
if(expression.type == "transcript"){
count.data <- read.delim(file.path(mouse.data.dir, "rsem.merged.transcript_counts.tsv"))
}else{
count.data <- read.delim(file.path(mouse.data.dir, "rsem.merged.gene_counts.tsv"))
}
u_id <- unique(mouse.info[,"animalName"])
ind.tables <- lapply(u_id, function(x) mouse.info[which(mouse.info[,"animalName"] == x),])
#This function takes all the information on a single individual and condenses
#it into a more readable format. Each individual has two lines in the table
#one each for the two KL haplotypes. The two lines tell us the genotype.
#we go by the sequenced genotype not by what is in Climb because that is
#more reliable.
#for each of these haplotypes separately.
condense_geno <- function(call, allele){
if(is.na(call)){
return("Inconclusive")
}
if(call == "WT"){
cond.geno <- "WT/WT"
}
if(call == "Homozygous"){
cond.geno <- paste0(allele, "/", allele)
}
if(call == "Heterozygous"){
cond.geno <- paste0("WT/", allele)
}
return(cond.geno)
}
merge.geno <- function(fc.geno, vs.geno){
#if both are wild type
u_geno <- unique(c(fc.geno, vs.geno))
if(any(is.na(u_geno))){
return("Inconclusive")
}
if(any(u_geno == "Inconclusive")){
return("Inconclusive")
}
if(length(u_geno) == 1){
final.geno <- u_geno
}else{
not.wt <- which(u_geno != "WT/WT")
if(length(not.wt) == 1){
final.geno <- u_geno[not.wt]
}else{
final.geno <- "FC/VS"
}
}
return(final.geno)
}
condense_info <- function(individ.table){
vs.idx <- which(individ.table[,"snp"] == "Klotho-VS")
fc.idx <- which(individ.table[,"snp"] == "Klotho-FC")
vs.seq <- condense_geno(call = individ.table[vs.idx,"genotype_seq"], allele = "VS")
fc.seq <- condense_geno(individ.table[fc.idx,"genotype_seq"], "FC")
seq.geno <- merge.geno(fc.seq, vs.seq)
vs.climb <- condense_geno(call = individ.table[vs.idx,"genotype_climb"], allele = "VS")
fc.climb <- condense_geno(individ.table[fc.idx,"genotype_climb"], "FC")
climb.geno <- merge.geno(fc.climb, vs.climb)
#dont.include <- c("genotype_climb", "genotype_seq", "snp", "validated.gt", "line")
dont.include <- c("genotype_climb", "genotype_seq", "snp", "line")
include <- setdiff(colnames(individ.table), dont.include)
final.info <- unlist(c(individ.table[1,include], "seq_genotype" = seq.geno, "climb_geno" = climb.geno))
return(final.info)
}
condensed.table <- t(sapply(ind.tables, condense_info))
#head(condensed.table)
#one mouse was labeled as 24 months, but it should be 12 months (talked with Dylan Garceau)
#change this by hand
wrong.age <- which(condensed.table[,"age_m"] == 24)
condensed.table[wrong.age,"age_m"] <- 12
#remove animals identified in 1a_Klotho_QC.Rmd
if(remove.qc){
#remove animals identified by 1a_Klotho_QC.Rmd
qc.id <- read.table(here("Data", "general", "ID_QC.txt"), header = TRUE)
for(i in 1:nrow(qc.id)){
mouse.id <- qc.id[i,1]
action <- qc.id[i,2]
mouse.idx <- which(condensed.table[,"animalName"] == mouse.id)
if(action == "remove"){
condensed.table <- condensed.table[-mouse.idx,]
}else{
condensed.table[mouse.idx,"sex_climb"] <- action
}
}
}
#intersect(qc.id[,1], condensed.table[,"animalName"])
The table of animals with called genotypes is as follows:
datatable(condensed.table)
write.table(condensed.table, file.path(results.dir, "mouse.info.csv"), sep = ",", quote = FALSE)
The tables below show the numbers of animals in each group. The x axis in each panel shows the age of the mice in months. The y axis shows each included genotype.
The largest group right now is 12-month-old WT mice. There are no FC/VS mice yet.
sexes <- unique(condensed.table[,"sex_ge"]) #use the sequenced sex
ages <- sort(as.numeric(unique(condensed.table[,"age_m"])))
genotypes <- unique(condensed.table[,"climb_geno"])[c(2,5,4,1,3,6)]
#build a separate table for each allele and sex and count the
#mice in each age group.
count.tables <- vector(mode = "list", length = length(sexes))
names(count.tables) <- sexes
for(s in 1:length(sexes)){
sex.idx <- which(condensed.table[,"sex_climb"] == sexes[s])
by.age <- matrix(0, nrow = length(genotypes), ncol = length(ages))
rownames(by.age) <- genotypes
colnames(by.age) <- ages
for(g in 1:length(genotypes)){
genotype.idx <- grep(genotypes[g], condensed.table[,"climb_geno"])
for(ag in 1:length(ages)){
age.idx <- which(condensed.table[,"age_m"] == ages[ag])
num.in.group <- length(Reduce("intersect", list(sex.idx, genotype.idx, age.idx)))
by.age[g,ag] <- num.in.group
}
}
count.tables[[s]] <- by.age
}
#quartz(width = 9, height = 3)
par(mfrow = c(1,2), mar = c(2,8,4,2))
for(s in 1:length(sexes)){
imageWithText(count.tables[[s]], use.pheatmap.colors = TRUE,
main = sexes[s], cex = 1, main.shift = 0.15, row.text.shift = 0.1,
col.text.rotation = 0, col.text.shift = 0.15, col.text.adj = 0.5)
}
inc.idx <- which(condensed.table[,"climb_geno"] == "Inconclusive")
if(length(inc.idx) > 0){
condensed.table <- condensed.table[-inc.idx,,drop=FALSE]
}
We removed 0 entries with inconclusive genotypes.
The age groups are 4 and 12 months so far. Twenty-four months will be coming later. Here we bin the animals into those age groups.
age.groups <- c(4, 12, 24)
age.diff <- lapply(age.groups, function(x) abs(x-as.numeric(condensed.table[,"age_m"])))
age.which <- lapply(age.diff, function(x) which(x <= 1))
age.batch <- rep(NA, nrow(condensed.table))
for(i in 1:length(age.which)){
age.batch[age.which[[i]]] <- age.groups[i]
}
condensed.table <- cbind(condensed.table, "age_batch" = age.batch)
sexes <- unique(condensed.table[,"sex_ge"])
ages <- sort(as.numeric(unique(condensed.table[,"age_batch"])))
genotypes <- unique(condensed.table[,"climb_geno"])[c(2,5,4,1,3,6)]
#build a separate table for each allele and sex and count the
#mice in each age group.
count.tables <- vector(mode = "list", length = length(sexes))
names(count.tables) <- sexes
for(s in 1:length(sexes)){
sex.idx <- which(condensed.table[,"sex_climb"] == sexes[s])
by.age <- matrix(0, nrow = length(genotypes), ncol = length(ages))
rownames(by.age) <- genotypes
colnames(by.age) <- ages
for(g in 1:length(genotypes)){
genotype.idx <- grep(genotypes[g], condensed.table[,"climb_geno"])
for(ag in 1:length(ages)){
age.idx <- which(condensed.table[,"age_m"] == ages[ag])
num.in.group <- length(Reduce("intersect", list(sex.idx, genotype.idx, age.idx)))
by.age[g,ag] <- num.in.group
}
}
count.tables[[s]] <- by.age
}
full.count.table <- Reduce("cbind", count.tables)
colnames(full.count.table) <- paste(rep(names(count.tables),
each = length(count.tables)), colnames(full.count.table), sep = "_")
keep.rows <- which(!is.na(row.names(full.count.table)))
write.table(full.count.table[keep.rows,],
here("Results", "for_paper", "mouse_count_table.txt"), sep = "\t",
quote = FALSE)
The following grid shows how many animals are in each age batch. We will use the age batch as a covariate going forward.
#quartz(width = 8, height = 4)
par(mfrow = c(1,2), mar = c(2,8,4,2))
for(s in 1:length(sexes)){
imageWithText(count.tables[[s]], use.pheatmap.colors = TRUE,
main = sexes[s], cex = 1, main.shift = 0.15, row.text.shift = 0.55,
col.text.rotation = 0, col.text.shift = 0.15, col.text.adj = 0.5)
}
If there is a subgroup analysis specified in the parameter section, we select that here.
if(subgroup != "all ages"){
sub.idx <- Reduce("intersect", lapply(1:length(subgroup),
function(x) which(condensed.table[,names(subgroup)[x]] == subgroup[[x]])))
#overwrite condensed.table
if(length(sub.idx))
full.table <- condensed.table
condensed.table <- condensed.table[sub.idx,]
}
Here we use transcript count data. These data were generated by Annat Haber and Catrina Spruce. The details of the analysis can be found on Synapse with ID syn52749871.
count.num <- as.matrix(count.data[,3:ncol(count.data)])
count.id <- as.matrix(count.data[,1:2])
rownames(count.num) <- count.id[,1] #name using transcript IDs
colnames(count.num) <- gsub("X", "", colnames(count.num))
#make sure mouse IDs are aligned across the information table
#and data table.
included.idx <- match(condensed.table[,1], colnames(count.num))
#cbind(colnames(count.num)[included.idx], condensed.table[,1])
sub.count <- count.num[,included.idx]
Use DESeq2 to filter out low-expressing genes and to normalize. We use the vst normalization.
#make a data frame with relevant covariate and group data.
info <- data.frame("sequencingBatch" = as.factor(condensed.table[,"sequencingBatch"]),
"sex_ge" = as.factor(condensed.table[,"sex_ge"]),
"age_batch" = as.numeric(condensed.table[,"age_batch"]),
"climb_geno" = factor(condensed.table[,"climb_geno"],
levels = c("WT/WT", "WT/FC", "FC/FC", "WT/VS", "VS/VS"))) #put WT as the reference level
rownames(info) <- condensed.table[,"animalName"]
write.table(info, file.path(processed.data.dir, "mouse_info.csv"), sep = ",", row.names = TRUE)
# Create design matrices: Include both the condition of interest and the batch variable.
# Assuming 'genotype' is your condition and 'sequencingBatch' is the batch factor
# age_m and sex_ge are potential covariates
possible.covar <- c("sequencingBatch", "sex_ge", "age_batch", "climb_geno")
use.covar <- setdiff(possible.covar, names(subgroup))
fmla <- as.formula(paste0("~", paste(use.covar, collapse = " + ")))
design.genotype <- model.matrix(fmla, data=info)
#I tested putting different variables as the "group" variable,
#and all normalizations were identical.
dds <- DESeqDataSetFromMatrix(round(sub.count), colData = info, design = design.genotype)
## converting counts to integer mode
#nrow(dds)
smallestGroupSize <- 4
keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize
dds <- dds[keep,]
#nrow(dds)
sub.id <- count.id[keep,]
vsd <- vst(dds, blind = FALSE)
#pre.norm.decomp <- plot.decomp(t(assay(dds)), plot.results = FALSE)
#post.norm.decomp <- plot.decomp(t(assay(vsd)), plot.results = FALSE)
#take invariant covariates out of info
num.var <- apply(info, 2, function(x) length(unique(x)))
keep.var <- which(num.var > 1)
covar.mat <- info[,keep.var]
write.table(covar.mat, file.path(results.dir, "covariates.csv"), sep = ",", quote = FALSE)
#gather information about transcripts in filtered data set
gene.info.file <- file.path(general.data.dir, "mouse_gene_info.txt")
if(!file.exists(gene.info.file)){
library(biomaRt)
all.var <- ls()
lib.loaded <- as.logical(length(which(all.var == "mus")))
if(!lib.loaded){
mus <- useEnsembl(biomart="ensembl", dataset="mmusculus_gene_ensembl")
}
gene.info <- getBM(c("ensembl_gene_id", "external_gene_name", "entrezgene_id", "chromosome_name",
"start_position", "end_position"), "ensembl_gene_id", sub.id[,1], mus)
write.table(gene.info, gene.info.file, quote = FALSE, row.names = FALSE, sep = "\t")
}else{
gene.info <- read.delim(gene.info.file)
}
We looked at the relationships between the first four PCs of the normalized gene expression matrix and relevant variables. The plots are big, so these are printed to pdfs in the results folder.
top.var.alpha.thresh <- 0.05
Depending on how many of the top variable genes we use, the factors are correlated with different PCs. Here we use all genes for which the model explaining expression has a p value less than 0.05.
#top genes can be a number specifying how many of the most
#variable genes to use in the decomposition.
#gene.names can be used to specify specific genes to be
#used. This can help in identifying sample swaps in QC
plot_pc_factors <- function(expr.mat, factors, n.pc = 4, top.genes = NULL,
highlight.ind = NULL, highlight.col = "blue"){
if(!is.data.frame(factors)){stop("factors must be a data frame.")}
main.text <- ""
if(!is.null(top.genes)){
gene.var <- apply(expr.mat, 2, var)
sorted.var <- sort(gene.var, decreasing = TRUE)[1:top.genes]
expr.mat <- expr.mat[,names(sorted.var)]
main.text <- paste("\n", top.genes, "most variable genes")
}
expr.decomp <- plot.decomp(expr.mat, pc = n.pc, plot.results = FALSE)
pc.var.exp <- signif(expr.decomp$var.exp*100, 2)[1:n.pc]
pc.pairs <- pair.matrix(1:n.pc)
pc.factor.cor <- matrix(ncol = n.pc, nrow = ncol(factors))
colnames(pc.factor.cor) <- paste0("PC", 1:n.pc)
rownames(pc.factor.cor) <- colnames(factors)
#for each factor, plot out the pairs of PCs and individual PCs
for(i in 1:ncol(factors)){
cat("###", colnames(factors)[i], "\n")
#treat everything, even age as a factor
if(colnames(factors)[i] == "climb_geno"){
factor.cols <- geno.cols
factor.names <- names(factor.cols)
}else{
factor.names <- levels(as.factor(factors[,i]))
factor.cols <- c("black", "#CE5C6D", "#7ECC61")
names(factor.cols) <- factor.names
}
gcol <- rep(NA, nrow(factors))
for(g in 1:length(factor.cols)){
gcol[which(factors[,i] == names(factor.cols)[g])] <- factor.cols[g]
}
#if we are highlighting individuals, change their color here
if(!is.null(highlight.ind)){
ind.idx <- which(rownames(expr.mat) %in% highlight.ind)
gcol[ind.idx] <- highlight.col
}
if(i < ncol(factors)){
layout.mat <- get.layout.mat(nrow(pc.pairs)+n.pc+2)
}else{
layout.mat <- get.layout.mat(nrow(pc.pairs)+n.pc+3)
}
layout(layout.mat)
par(mar = c(4,4,4,2))
for(p in 1:nrow(pc.pairs)){
pc1 <- pc.pairs[p,1]
pc1.var <- pc.var.exp[pc1]
pc2 <- pc.pairs[p,2]
pc2.var <- pc.var.exp[pc2]
if(is.factor(factors[,i])){
plot(expr.decomp$u[,pc1], expr.decomp$u[,pc2], pch = 16, xlab = paste0("PC", pc1, " (", pc1.var, "%)"),
ylab = paste0("PC", pc2, " (", pc2.var, "%)"), col = gcol,
main = paste(colnames(factors)[i], main.text))
}else{
plot(expr.decomp$u[,pc1], expr.decomp$u[,pc2], pch = 16,
xlab = paste0("PC", pc1, " (", pc1.var, "%)"),
ylab = paste0("PC", pc2, " (", pc2.var, "%)"), col = gcol,
main = paste(colnames(factors)[i], main.text))
}
}
#add barplots
for(p in 1:n.pc){
if(length(levels(as.factor(factors[,i]))) > 1){
model <- lm(expr.decomp$u[,p]~factors[,i])
pc.factor.cor[i,p] <- summary(model)$adj.r.squared
}
pc.order <- order(expr.decomp$u[,p])
barplot(expr.decomp$u[pc.order,p],
col = gcol[pc.order],
main = paste("PC", p, "\n", colnames(factors)[i], main.text), border = NA)
#factors[pc.order[intersect(which(expr.decomp$u[pc.order,p] < 0), which(gcol[pc.order] == "black"))],]
#factors[pc.order[intersect(which(expr.decomp$u[pc.order,p] > 0), which(gcol[pc.order] != "black"))],]
#find mis-matches
#not.neg <- intersect(which(expr.decomp$u[,1] < 0), which(gcol == "#CE5C6D"))
#rownames(expr.mat)[not.neg]
#not.pos <- intersect(which(expr.decomp$u[,1] > 0), which(gcol == "black"))
#rownames(expr.mat)[not.pos]
}
#add legend
par(mar = c(0,0,2,2))
plot.new()
plot.window(xlim = c(0,1), ylim = c(0,1))
legend(0, 1, legend = factor.names, pch = 16, col = factor.cols)
#show correlate each PC with each factor
par(mar = c(4,4,2,2))
if(!all(is.na(pc.factor.cor[i,]))){
a <- barplot(abs(pc.factor.cor[i,]), ylab = "Adjusted R^2",
main = paste0("Adjusted R^2"), ylim = c(0,1),
names = paste0("PC", 1:n.pc, "\n", pc.var.exp, "%"))
text(a, abs(pc.factor.cor[i,])+0.04, labels = signif(abs(pc.factor.cor[i,]), 2))
#text(a, rep(0.04, length(a)), labels = paste(signif(pc.var.exp, 2), "%"))
}
cat("\n\n")
if(i == ncol(factors)){
par(mar = c(6,8,6,2))
scaled.factor.f <- t(apply(abs(pc.factor.cor), 1, function(x) x/max(x)))
imageWithText(signif(scaled.factor.f, 2), use.pheatmap.colors = TRUE, col.text.rotation = 0,
col.text.adj = 0.5, col.text.shift = 0.2, row.text.shift = 0.2, main.shift = 0.2,
main = "PC-Factor Correlations")
}
}
}
We fit a model to explain the expression of each gene with sequencing batch, sex, age, and genotype. We selected all genes that had a nominally significant p value (p < 0.05) for the entire model.
plotting.df <- data.frame("sequencingBatch" = as.factor(condensed.table[,"sequencingBatch"]),
"sex_ge" = as.factor(condensed.table[,"sex_ge"]),
"age_batch" = as.numeric(condensed.table[,"age_batch"]),
"climb_geno" = factor(condensed.table[,"climb_geno"],
levels = c("WT/WT", "WT/FC", "FC/FC", "WT/VS", "VS/VS"))) #make WT the reference levl
rownames(plotting.df) <- condensed.table[,"animalName"]
#test genes for variability based on any of the possible causal factors
#then filter for the genes that vary by these, not just variable genes
#in general
expr <- assay(vsd)
models <- apply(expr, 1, function(x) lm(x~sequencingBatch+sex_ge+age_batch+climb_geno, data = plotting.df))
model.f <- lapply(models, function(x) summary(x)$fstatistic)
model.p <- sapply(model.f, function(f) pf(f[1],f[2],f[3],lower.tail=F))
#qqunif.plot(model.p)
#big.p <- which(-log10(model.p) > 10)
sig.idx <- which(model.p < top.var.alpha.thresh)
#checked to see if VS/FC carrier status was better represented in the PCs
#It is not
#carrier.status <- rep("WT", nrow(plotting.df))
#carrier.status[grep("VS", plotting.df[,"climb_geno"])] <- "VS"
#carrier.status[grep("FC", plotting.df[,"climb_geno"])] <- "FC"
#plotting.df <- cbind(plotting.df, "carrier.status" = as.factor(carrier.status))
pdf(file.path(results.dir,
paste0("PCs_and_factors_pval_less_than_", top.var.alpha.thresh, ".pdf")), width = 12, height = 8)
plot_pc_factors(expr.mat = t(expr[sig.idx,]), factors = plotting.df, n.pc = 4)
dev.off()
quartz_off_screen 2
If we have selected to analyze all ages, the plot below shows how well genotype explains variation in PC1 of the transcription matrix at different ages.
if(subgroup == "all ages"){
expr.decomp <- plot.decomp(t(expr[sig.idx,]), plot.results = FALSE)
model <- lm(expr.decomp$u[,1]~as.factor(condensed.table[,"climb_geno"])*as.factor(condensed.table[,"age_batch"]))
#boxplot(expr.decomp$u[,1]~as.factor(condensed.table[,"climb_geno"])*as.factor(condensed.table[,"age_batch"]))
ylim <- c(min(expr.decomp$u[,1]), max(expr.decomp$u[,1]))
geno <- condensed.table[,"climb_geno"]
age <- condensed.table[,"age_batch"]
u_age <- unique(age)
#quartz(width = 10, height = 5)
par(mfrow = c(1,2), mar = c(6,4,4,2))
for(a in 1:length(u_age)){
age.idx <- which(age == u_age[a])
age.geno <- geno[age.idx]
age.pc <- expr.decomp$u[age.idx,1]
age.geno.pc <- lapply(names(geno.cols), function(x) age.pc[which(age.geno == x)])
age.model <- lm(age.pc~as.factor(age.geno))
age.r2 <- summary(age.model)$adj.r.squared
age.f <- summary(age.model)$fstatistic
age.p <- pf(age.f[1], age.f[2], age.f[3],lower.tail=F)
boxplot(age.geno.pc, names = names(geno.cols), col = geno.cols, ylab = "PC1",
main = paste0("PC1 by genotype at ", u_age[a], " months\nR2 = ",
signif(age.r2, 2), "; p = ", signif(age.p, 2)), ylim = ylim)
abline(h = 0)
}
}
This yielded 7569 genes. We used this subset of genes in the PC plots to see which factors contributed to overall variation in gene expression.
The following plots show how Klotho expression varies with age, sex, and genotype. There isn’t much variation in Klotho transcription in these animals. That’s probably fine. The Klotho variants we are dealing with are coding variants and probably don’t have any effect on gene expression.
norm.expr <- assay(vsd)
saveRDS(norm.expr, file.path(processed.data.dir, "Normalized_Expression.RDS"))
#scale expression across individuals
scaled.expr <- t(apply(norm.expr, 1, scale))
dimnames(scaled.expr) <- dimnames(norm.expr)
saveRDS(scaled.expr, file.path(processed.data.dir, "Scaled_Expression.RDS"))
if(expression.type == "transcript"){
tx.id <- sub.id[which(sub.id[,"gene_id"] == kl.ensembl),"transcript_id"]
}else{
tx.id <- sub.id[which(sub.id[,"gene_id"] == kl.ensembl),"gene_id"]
}
kl.levels <- scaled.expr[tx.id,,drop=FALSE]
if(expression.type == "transcript"){
plot.with.model(kl.levels[1,], kl.levels[2,], xlab = "Kl transcript 1",
ylab = "Kl transcript 2")
}
plot_tx_with_factor <- function(expr.mat, covar.table, tx_name, factor_name,
factor_type = "categorical", ylab = "Count",
tx_label = "Transcript", pt_col = "#c51b8a", cex.labels = 1){
#make a dummy matrix to adjust expression
factor.idx <- which(colnames(covar.table) == factor_name)
dummy.mat <- dummy_covar(as.matrix(covar.table[,-c(factor.idx)]))
for(i in 1:length(tx_name)){
if(factor_type == "categorical"){
model <- lm(expr.mat[tx_name[i],]~covar.table[,factor_name])
model.p <- signif(anova(model)$"Pr(>F)"[1], 2)
vioplot(expr.mat[tx_name[i],]~covar.table[,factor_name], xlab = "",
main = paste(tx_label[i], "\np =", model.p), col = "lightgray",
cex.names = cex.labels, cex.axis = cex.labels, ylab = "")
mtext(ylab, side = 2, line = 2.5, cex = cex.labels)
stripchart(expr.mat[tx_name[i],]~covar.table[,factor_name],
col = pt_col, vertical = TRUE, pch = 16, method = "jitter", add = TRUE)
}
if(factor_type == "numeric"){
model <- lm(expr.mat[tx_name[i],]~as.numeric(covar.table[,factor_name]))
model.p <- signif(anova(model)$"Pr(>F)"[1], 2)
vioplot(expr.mat[tx_name[i],]~as.numeric(covar.table[,factor_name]), xlab = "",
main = paste(tx_label[i], "\np =", model.p), col = "lightgray",
cex.names = cex.labels, cex.axis = cex.labels, ylab = "")
mtext(ylab, side = 2, line = 2.5, cex = cex.labels)
stripchart(expr.mat[tx_name[i],]~as.numeric(covar.table[,factor_name]),
col = pt_col, vertical = TRUE, pch = 16, method = "jitter", add = TRUE)
}
}
}
plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat,
tx_name = tx.id, "climb_geno",
tx_label = paste0("Kl Transcript", 1:length(tx.id)),
ylab = "Expression (A.U.)", order.by.mean = FALSE)
if(subgroup[[1]] == 12){
pdf(here("Results", "for_paper",
paste0("Klotho_expression_by_genotype_", basename(results.dir), "_months.pdf")),
width = 5, height = 6)
par(bg = NA)
plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat,
tx_name = tx.id, "climb_geno",
tx_label = paste0("Kl Transcript", 1:length(tx.id)),
ylab = "Expression (A.U.)", order.by.mean = FALSE)
dev.off()
}
## quartz_off_screen
## 2
par(mfrow = c(1,2))
plot_tx_with_factor(expr.mat = scaled.expr, covar.table = covar.mat,
tx_name = tx.id, factor_name = "sex_ge",
tx_label = paste0("Kl Transcript", 1:length(tx.id)),
ylab = "Expression (A.U.)")
if(subgroup[[1]] == 12){
pdf(here("Results", "for_paper",
paste0("Klotho_expression_by_sex_", basename(results.dir), "_months.pdf")),
width = 5, height = 5)
par(bg = NA)
plot_tx_with_factor(expr.mat = scaled.expr, covar.table = covar.mat,
tx_name = tx.id, factor_name = "sex_ge",
tx_label = paste0("Kl Transcript", 1:length(tx.id)),
ylab = "Expression (A.U.)", cex.labels = 1.5)
dev.off()
}
## quartz_off_screen
## 2
age.varies <- which(colnames(covar.mat) == "age_batch")
if(length(age.varies) > 0){
par(mfrow = c(1,2))
plot_tx_with_factor(expr.mat = scaled.expr, covar.table = covar.mat,
tx.id, "age_batch", "numeric",
paste0("Kl Transcript", 1:length(tx.id)),
ylab = "Expression (A.U.)", cex.labels = 1.5)
}else{
plot.text("Age does not vary in this population")
}
if(length(age.varies) > 0){
pdf(here("Results", "for_paper",
paste0("Klotho_expression_by_sex_", basename(results.dir), "_months.pdf")),
width = 5, height = 5)
par(bg = NA)
age.varies <- which(colnames(covar.mat) == "age_batch")
plot_tx_with_factor(expr.mat = scaled.expr, covar.table = covar.mat,
tx.id, "age_batch", "numeric",
paste0("Kl Transcript", 1:length(tx.id)),
ylab = "Expression (A.U.)", cex.label = 1.5)
dev.off()
}
We tested for differential expression with the different factors. For each factor, we adjusted the expression matrix for the other factors and test for the final factor.
test_full_model <- function(expr.mat, covar.mat){
full.model.file <- file.path(results.dir, "Full_Model_Results.RDS")
if(!file.exists(full.model.file)){
#match covariates to adjusted expression matrix
common.ind <- intersect(colnames(expr.mat), rownames(covar.mat))
expr.idx <- match(common.ind, colnames(expr.mat))
covar.idx <- match(common.ind, rownames(covar.mat))
#adjust for sequencing batch
dummy.batch <- dummy_covar(covar.mat[covar.idx,"sequencingBatch",drop=FALSE])
adj.expr.file <- file.path(processed.data.dir, "Batch_Adjusted_Expr.RDS")
if(!file.exists(adj.expr.file)){
adj.expr <- adjust(t(expr.mat[,expr.idx]), dummy.batch)
saveRDS(adj.expr, adj.expr.file)
}else{
adj.expr <- readRDS(adj.expr.file)
}
#test one to get factor names
df <- cbind("expr" = adj.expr[,1], covar.mat)
diff.test <- lm(expr~(sex_ge+age_batch+climb_geno)^2, data = df)
factor.names <- rownames(anova(diff.test))
num.factors <- length(factor.names)-1
#test all main effects and interactions
effect.p <- matrix(NA, nrow = ncol(adj.expr), ncol = num.factors)
colnames(effect.p) <- factor.names[1:num.factors]
model.r2 <- rep(NA, ncol(adj.expr))
for(i in 1:ncol(adj.expr)){
df <- cbind("expr" = adj.expr[,i], covar.mat)
diff.test <- lm(expr~(sex_ge+age_batch+climb_geno)^2, data = df)
model.r2[i] <- summary(diff.test)$adj.r.squared
effect.p[i,] <- anova(diff.test)$"Pr(>F)"[1:num.factors]
#coef(diff.test)
}
rownames(effect.p) <- names(model.r2) <- colnames(adj.expr)
#par(mar = c(12,4,4,2)); boxplot(-log10(effect.p), las = 2, ylab = "-log10(p)")
#plot.decomp(t(-log10(effect.p)), label.points = TRUE)
result <- list("p_values" = effect.p, "R2" = model.r2)
saveRDS(result, full.model.file)
}else{
result <- readRDS(full.model.file)
}
return(result)
}
We first used a linear model to test for main effects of each covariate along with interactions between the covariates.
full.model.result <- test_full_model(expr.mat = scaled.expr, covar.mat = plotting.df)
effect.p <- full.model.result$p_values
model.r2 <- full.model.result$R2
#take out the genes with the largest sex effect. These are all X and Y genes.
sex.effect.threshold = 10
big.sex.effect <- which(-log10(effect.p[,"sex_ge"]) > sex.effect.threshold)
big.sex.genes <- gene.info[match(names(big.sex.effect), gene.info[,"ensembl_gene_id"]),]
effect.p <- effect.p[-big.sex.effect,]
we removed the genes with the sex effects greater than 10. There were 12 with effects this large. They are shown below and reside mostly on the sex chromosomes.
colnames(big.sex.genes) <- c("geneID", "geneName","entrezID", "Chr", "start", "end")
datatable(big.sex.genes)
The box plot below shows the distributions of the effects of each factor with the sex-specific genes removed.
par(mar = c(12,4,4,2))
boxplot(-log10(effect.p), las = 2, ylab = "-log10(p)")
The following are qq plots for each p value distribution. These are interesting. It looks as if almost every gene’s transcription is affected by Klotho genotype. There are many that are affected by age. There is a large effect of sex, but not as big as the age or genotype effect.
There really arent any genes with an interaction effect between sex and genotype. Same for the interaction between sex and genotype. This is unexpected.
There are many more genes that have a sex by age interaction.
effect.fdr <- apply(effect.p, 2, function(x) p.adjust(x, "fdr"))
model.r2 <- model.r2[-big.sex.effect]
layout(get.layout.mat(ncol(effect.p)))
for(i in 1:ncol(effect.p)){
qqunif.plot(effect.p[,i], plot.label = colnames(effect.p)[i])
}
The following bar plot shows the number of genes with significant effects due to each factor with an FDR less than 0.1.
Klotho genotype affected expression of the most genes, followed by age. Sex affected very few, which I think is consistent with previous results in the brain.
A handful of genes had a sex by age interaction, and two had a genotype by age interaction.
factor.fdr <- apply(effect.p, 2, function(x) p.adjust(x, "fdr"))
sig.factor <- apply(factor.fdr, 2, function(x) which(x <= fdr.thresh))
#sig.factor <- apply(effect.p, 2, function(x) which(x <= 0.01))
num.sig <- sapply(sig.factor, length)
par(mar = c(12, 4, 4, 2))
barplot_with_num(num.sig, las = 2, ylab = "Count")
saveRDS(num.sig,
here("Results", "for_paper", paste0("num_de_", basename(results.dir), ".RDS")))
The following heat map shows enrichments for genes that were nominally significant for each main effect or interaction.
enrich <- lapply(sig.factor, function(x) gost(names(x), organism = "mmusculus", sources = c("GO", "KEGG", "CORUM", "HP", "REACTOME")))
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
plot.enrichment.group(enrich, max.term.size = 3000, transformation = sqrt, max.char = 100)
Interaction plots for any genes with significant age by genotype interactions are printed to the results folder for this run in a pdf called age_geno_int.pdf.
adj.expr <- readRDS(file.path(processed.data.dir, "Batch_Adjusted_Expr.RDS"))
age.geno.col <- which(colnames(factor.fdr) == "age_batch:climb_geno")
if(length(age.geno.col) > 0){
sig.idx <- which(factor.fdr[,age.geno.col] <= fdr.thresh)
if(length(sig.idx) > 0){
pdf(file.path(results.dir, "age_geno_int.pdf"), width = 7, height = 5)
for(i in 1:length(sig.idx)){
gene.id <- rownames(factor.fdr)[sig.idx[i]]
model <- lm(adj.expr[,gene.id]~plotting.df[,"age_batch"]*plotting.df[,"climb_geno"])
#summary(model)
#anova(model)
#gene.id <- sample(rownames(factor.fdr), 1)
gene.name <- gene.info[which(gene.info[,"ensembl_gene_id"] == gene.id),"external_gene_name"]
interaction.plot(plotting.df[,"age_batch"], plotting.df[,"climb_geno"], adj.expr[,gene.id], col = geno.cols, lwd = 3,
xlab = "Age", ylab = "Expression", main = gene.name, trace.label = "genotype")
}
dev.off()
}
}
Interaction plots for genes with significant sex by genotype interactions are printed to the results folder for this run in a pdf called sex_geno_int.pdf.
sex.geno.col <- which(colnames(factor.fdr) == "sex_ge:climb_geno")
if(length(sex.geno.col) > 0){
sig.idx <- which(factor.fdr[,sex.geno.col] <= fdr.thresh)
if(length(sig.idx) > 0){
pdf(file.path(results.dir, "sex_geno_int.pdf"), width = 7, height = 5)
for(i in 1:length(sig.idx)){
gene.id <- rownames(factor.fdr)[sig.idx[i]]
model <- lm(adj.expr[,gene.id]~plotting.df[,"sex_ge"]*plotting.df[,"climb_geno"])
#summary(model)
#anova(model)
#gene.id <- sample(rownames(factor.fdr), 1)
gene.name <- gene.info[which(gene.info[,"ensembl_gene_id"] == gene.id),"external_gene_name"]
interaction.plot(plotting.df[,"sex_ge"], plotting.df[,"climb_geno"], adj.expr[,gene.id], col = geno.cols, lwd = 3,
xlab = "Sex", ylab = "Expression", main = gene.name, trace.label = "genotype")
}
dev.off()
}
}
## quartz_off_screen
## 2
de.files <- get.files(here("Results", "for_paper"), want = c("num_de", "RDS"), full.names = TRUE)
factor.labels <- c("sex_ge" = "sex", "climb_geno" = "Klotho genotype", "age_batch" = "age")
if(length(de.files) == 3){
num.de <- lapply(de.files, readRDS)
names(num.de) <- gsub(".RDS", "", gsub("num_de_", "", basename(de.files)))
all.factors <- Reduce("union", lapply(num.de, names))
count.table <- matrix(NA, ncol = length(all.factors), nrow = length(num.de))
colnames(count.table) <- all.factors
rownames(count.table) <- names(num.de)
for(i in 1:length(num.de)){
count.table[i,names(num.de[[i]])] <- num.de[[i]]
}
#for now, make group a column, so we can turn it into a LaTeX table more easily
count.table <- cbind("group" = names(num.de), count.table)
for(i in 1:length(factor.labels)){
colnames(count.table) <- gsub(names(factor.labels)[i], factor.labels[i], colnames(count.table))
}
write.table(count.table, here("Results", "for_paper", "num_de_table.txt"),
row.names = FALSE, na = "--", quote = FALSE, sep = "\t")
#cd to ~/Documents/git_repositories/LaTeX-Table-Generator
#run python3 table_generator.py
#move the result from Outputs to the for_paper directory
#open the file and paste the table into the manuscript document
#paste the result into LaTeXit to test if needed.
}
The heatmap below shows the mean expression of differentially expressed genes for each genotype. This is with an fdr value of 0.1.
sig.geno.idx <- which(effect.fdr[,"climb_geno"] <= fdr.thresh)
sig.expr <- scaled.expr[names(sig.geno.idx),]
#divide into groups based on carrier status
alleles <- genotypes[1:5]
allele.idx <- lapply(alleles, function(x) which(info[,"climb_geno"] == x))
diff.expr <- lapply(allele.idx, function(x) sig.expr[,x])
mean.sig.expr <- sapply(diff.expr, rowMeans)
colnames(mean.sig.expr) <- alleles
expr.clust <- pam(mean.sig.expr, k = 2)
col.order <- order(expr.clust$clustering)
annot.col <- data.frame(as.factor(expr.clust$clustering))
colnames(annot.col) <- "cluster"
pheatmap(t(mean.sig.expr[col.order,ordered.geno]), show_colnames = FALSE, cluster_cols = FALSE,
cluster_rows = FALSE, annotation_col = annot.col)
gene.names <- mouse.gene.table[match(rownames(annot.col), mouse.gene.table[,"ensembl_gene_id"]), "external_gene_name"]
write.table(cbind(gene.names, annot.col)[order(annot.col[,1]),], file.path(results.dir, "DEG_clusters.txt"),
sep = "\t", quote = FALSE)
if(subgroup[[1]] == 12){
cluster.id <- expr.clust$clustering
png(here("Results", "for_paper", "deg_clusters.png"), width = 7, height = 3, units = "in", res = 300)
scaled.mat <- t(apply(mean.sig.expr[col.order,ordered.geno], 1, scale))
dimnames(scaled.mat) <- dimnames(mean.sig.expr[,ordered.geno])
layout.mat <- matrix(c(1,2), nrow = 1)
layout(layout.mat, widths = c(1, 0.2))
par(mar = c(2,4,4,1), xpd = NA)
imageWithText(t(scaled.mat), show.text = FALSE, use.pheatmap.colors = TRUE,
col.names = NULL, row.text.shift = 0.07, row.text.adj = 0.5)
plot.dim <- par("usr")
plot.height <- plot.dim[4] - plot.dim[3]
plot.width <- plot.dim[2] - plot.dim[1]
c1.mid <- mean(which(cluster.id[col.order] == 1))
c2.mid <- mean(which(cluster.id[col.order] == 2))
label.y <- (plot.dim[4]+(plot.height*0.09))
text(x = c1.mid, y = label.y, labels = "Neuron and Synapse")
text(x = c2.mid, y = label.y, labels = "Mitochondria and Ribosome")
#add colored labels for the genotypes
y.vals <- 1:length(ordered.geno)
y.shift <- plot.height*0.06
label.xmin <- plot.dim[1] - (plot.width*0.09)
label.xmax <- plot.dim[1] + (plot.width*0.03)
for(i in 1:length(y.vals)){
draw.rectangle(label.xmin, label.xmax, (y.vals[i] - y.shift), (y.vals[i] + y.shift),
border = geno.cols[match(rev(ordered.geno)[i], names(geno.cols))],
lwd = 3)
}
#add scale bar
par(mar = c(2,4,4,1))
imageWithTextColorbar(mean.sig.expr, use.pheatmap.colors = TRUE, cex = 1, bar.lwd = 3)
dev.off()
}
## quartz_off_screen
## 2
We clustered these genes into two groups and looked at enrichment. The enrichments of the groups are shown below.
group.genes <- lapply(1:2, function(x) rownames(annot.col)[which(annot.col[,1] == x)])
group.enrich = lapply(group.genes, function(x) gost(x, organism = "mmusculus",
sources = c("GO", "KEGG", "REACOME", "CORUM", "HP", "WP")))
names(group.enrich) <- c(1,2)
plot.enrichment.group(group.enrich, max.term.size = 3000, n.terms = 15)
if(subgroup[[1]] == 12){
pdf(here("Results", "for_paper", "Enrichment.pdf"), width = 8, height = 7)
for(i in 1:length(group.enrich)){
plot.enrichment(group.enrich[[i]], plot.label = paste("Cluster", i),
max.term.size = 3000, num.terms = 15)
}
dev.off()
}
## quartz_off_screen
## 2
The following correlation matrix heat map shows that the VS homo- and heterozygote effects are correlated with each other, as are the FC homo- and heterozygote effects. The VS and FC effects are negatively correlated with each other, meaning that when the FC genotype has increased expression of a gene, the VS genotype tends to have decreased expression of that gene.
The FC genotypes are more correlated with each other than the VS genotypes suggesting that the heterozygote and homozygote of the VS alleles have different effects more often than the homo- and heterozygote of the FC alleles.
The WT mice are also more correlated with the FC than with the VS animals.
pheatmap(cor(mean.sig.expr), display_numbers = TRUE)
The following table shows functional enrichments for the group of genes that are differentially expressed across Klotho genotypes.
sig.enrich <- gost(rownames(mean.sig.expr), organism = "mmusculus",
sources = c("GO", "KEGG", "REACTOME", "CORUM", "HP", "HPA"))
plot.enrichment(sig.enrich, plot.label = "DEG by Klotho genotype", num.terms = 30,
max.term.size = 3000)
The following plot is a wordcloud version of the above data.
#pdf("~/Desktop/overall_enrich.pdf", width = 12, height = 7)
par(mfrow = c(1,2), mar = c(0,0,0,0))
plot.enrichment.wordcloud(sig.enrich, plot.label = "DEG by Klotho genotype", num.terms = 20,
max.term.size = 3000, just.wordcloud = FALSE)
#dev.off()
The following plot shows that compared with WT mice, the FC and VS alleles have opposing effects on many genes.
vs.idx <- grep("VS", colnames(mean.sig.expr))
vs.mean <- rowMeans(mean.sig.expr[,vs.idx])
fc.idx <- grep("FC", colnames(mean.sig.expr))
fc.mean <- rowMeans(mean.sig.expr[,fc.idx])
plot(vs.mean, fc.mean, xlab = "VS mean expression",
ylab = "FC mean expression")
abline(h = 0, v = 0)
The following plot shows the enrichments for the genes in each quadrant above.
quads <- pair.matrix(c("up", "down"), TRUE, TRUE)
enrich.idx <- vector(mode = "list", length = nrow(quads))
for(i in 1:nrow(quads)){
if(quads[i,1] == "up"){
x.idx <- which(vs.mean > 0)
}else{
x.idx <- which(vs.mean < 0)
}
if(quads[i,2] == "up"){
y.idx <- which(fc.mean > 0)
}else{
y.idx <- which(fc.mean < 0)
}
enrich.idx[[i]] <- intersect(x.idx, y.idx)
}
quad.enrich <- lapply(enrich.idx, function(x) gost(names(vs.mean)[x],
organism = "mmusculus", sources = c("GO", "KEGG", "REACTOME", "CORUM", "HP")))
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
#pdf("~/Desktop/enrichment.pdf", width = 10, height = 10)
layout.matrix <- matrix(c(4,1,3,2), nrow = 2, byrow = TRUE)
layout(layout.matrix)
par(mar = c(4,0,4,0))
for(i in 1:length(quad.enrich)){
if(i == 3){max.cex = 2}else{max.cex = 2.5}
plot.enrichment.wordcloud(quad.enrich[[i]], max.term.size = 3000,
plot.label = "", just.wordcloud = TRUE, max.cex = max.cex, num.terms = 30)
mtext(paste0("VS ", quads[i,1], "; FC ", quads[i,2]), side = 3, font = 2)
plot.dim <- par("usr")
draw.rectangle(plot.dim[1],plot.dim[2],plot.dim[3],plot.dim[4])
}
#dev.off()
We are interested in how Klotho genotype affects expression of genes that have previously been associated with Alzheimer’s disease. We downloaded a list of AD-associated genes from Agora: https://agora.adknowledgeportal.org/genes/nominated-targets This is their list of nominated targets.
We can separate the genes by evidence, e.g. gene expression, protein levels, genetic evidence, etc. Here we look at all AD genes.
#ad.gene.list <- read.delim(here("Data", "Human", "AD_genes.txt"), header = FALSE) #from book
ad.gene.list <- read.csv(here("Data", "Human", "gene-list.csv"), comment.char = "#")
#we can filter by a number of features:
#filter to those with RNA evidence
#has.rna <- grep("RNA", ad.gene.list[,"Input.Data"])
#ad.gene.list <- ad.gene.list[has.rna,]
#pick genes with more than one nomination
#multiple.noms <- which(ad.gene.list[,"Nominations"] > 1)
#ad.gene.list <- ad.gene.list[multiple.noms,]
common.ad <- intersect(ad.gene.list[,1], orthos[,"Human.Gene.Name"])
ad.orthos <- orthos[match(common.ad, orthos[,"Human.Gene.Name"]),"Mouse.Ortholog.Name"]
ad.ensembl <- intersect(gene.info[match(intersect(ad.orthos, gene.info[,"external_gene_name"]),
gene.info[,"external_gene_name"]), "ensembl_gene_id"], rownames(effect.fdr))
ad.sig <- intersect(rownames(mean.sig.expr), ad.ensembl)
The following plot shows how the differentially expressed genes and AD-associated genes interact with each other.
sig.list <- list("All genes" = rownames(effect.fdr), "DEG" = rownames(mean.sig.expr),
"AD Genes" = ad.ensembl)
#plotVenn(Vlist = sig.list)
circle.cols <- c("all" = "black", "deg" = "#bf812d", "ad" = "#756bb1")
ind.num <- sapply(sig.list, length)
deg.only <- length(setdiff(sig.list$DEG, sig.list$"AD Genes"))
ad.only <- length(setdiff(sig.list$"AD Genes", sig.list$DEG))
deg.and.ad <- length(intersect(sig.list$"AD Genes", sig.list$DEG))
par(mar = c(0,0,0,0))
plot.new()
plot.window(xlim = c(0, 1), ylim = c(0,1))
inner.center <- 0.45
inner.label.y <- 0.62
deg.center <- 0.4
ad.center <- 0.7
label.cex <- 1.2
#all.circle <- get_circle(radius = 0.45, center_x = 0.5, center_y = inner.center)
#points(all.circle$x, all.circle$y, col = circle.cols["all"], type = "l", lwd = 3)
draw.rectangle(min.x = 0.12, max.x = 0.92, min.y = 0.15, max.y = 0.85, border = circle.cols["all"], lwd = 3)
deg.circle <- get_circle(radius = 0.25, center_x = 0.4, center_y = inner.center)
points(deg.circle$x, deg.circle$y, col = circle.cols["deg"], type = "l", lwd = 3)
ad.circle <- get_circle(radius = 0.15, center_x = 0.7, center_y = inner.center)
points(ad.circle$x, ad.circle$y, col = circle.cols["ad"], type = "l", lwd = 3)
text(x = 0.52, y = 0.81, labels = paste0("All (", ind.num["All genes"], ")"),
col = circle.cols["all"], font = 2, cex = label.cex)
text(x = deg.center, y = inner.label.y, labels = paste0("DEG (", ind.num["DEG"], ")"),
col = circle.cols["deg"], font = 2, cex = label.cex)
text(x = ad.center+0.035, y = inner.label.y, labels = paste0("AD (", ind.num["AD Genes"], ")"),
col = circle.cols["ad"], font = 2, cex = label.cex)
text(x = deg.center, y = inner.center, labels = deg.only, font = 2, cex = label.cex, col = circle.cols["deg"])
text(x = ad.center+0.02, y = inner.center, labels = ad.only, font = 2, cex = label.cex, col = circle.cols["ad"])
text(x = 0.6, y = inner.center, labels = deg.and.ad, font = 2, cex = label.cex, col = mix_colors(circle.cols["deg"], circle.cols["ad"]))
Are AD genes enriched among the differentially expressed genes? Are there more AD genes among the DEG than we would expect by chance? We used the hypergeometric distribution to answer this.
We have an urn with \(N\) total marbles. Some of these marbles (\(K\) red marbles) are differentially expressed. \(N-K\) green marbles are not differentially expressed. If we draw \(n\) AD genes, what is the probability that exaclty \(k\) of them are red (i.e. differentially expressed)?
. We want to know the probability of drawing \(k\)
#quartz(width = 5, height = 5)
plot.new()
par(mar = c(1,1,1,1))
plot.window(xlim = c(0, 1), ylim = c(0, 1))
draw.rectangle(0.1, 0.9, 0.1, 0.9)
gene.circle <- get_circle(radius = 0.25, center_x = 0.4, center_y = 0.5)
lower.idx <- which(gene.circle$y <= 0.5)
upper.idx <- which(gene.circle$y >= 0.5)
plot.poly.xy(gene.circle$x[upper.idx], gene.circle$y[upper.idx],
gene.circle$x[lower.idx], gene.circle$y[lower.idx], border = NA,
col = rgb(190/256,174/256,212/256))
points(gene.circle$x, gene.circle$y, type = "l")
ad.circle <- get_circle(radius = 0.25, center_x = 0.6, center_y = 0.5)
lower.idx <- which(ad.circle$y <= 0.5)
upper.idx <- which(ad.circle$y >= 0.5)
plot.poly.xy(ad.circle$x[upper.idx], ad.circle$y[upper.idx],
ad.circle$x[lower.idx], ad.circle$y[lower.idx], border = NA,
col = rgb(127/256,201/256,127/256, alpha = 0.5))
points(ad.circle$x, ad.circle$y, type = "l")
#add labels
text(x = 0.72, y = 0.12, labels = "All genes = N", adj = 0)
text(x = 0.25, y = 0.76, labels = "DEG = K")
text(x = 0.83, y = 0.76, labels = "AD genes = n", adj = 1)
text(x = 0.5, y = 0.5, labels = "AD and DEG = k")
mtext("What is the probability that in our draw of AD genes\n
we get k that are differentially expressed?",
side = 1, line = -2.5, padj = 0)
#from: https://montilab.github.io/BS831/articles/docs/HyperEnrichment.html
all.gene.id <- gsub(".value", "", rownames(full.model.result$p_values))
all.gene.name <- gene.info[match(all.gene.id, gene.info[,"ensembl_gene_id"]), "external_gene_name"]
sig.deg <- rownames(sig.expr)
#ad.orthos
p.of.draw <- phyper(q = length(ad.sig)-1, #number of red marbles in our draw (number of AD genes that are DEG)
m = length(sig.deg), #number of red marbles (DEG) in the urn
n = length(all.gene.id) - length(sig.deg), #number of green marbles (not DEG)
k = length(ad.orthos), #number of drawn marbles (AD genes)
lower.tail = FALSE #compute P(X > 1 overlap), hence the -1 above.
)
The calculated probability of there 260 differentially expressed AD genes without enrichment in the set of all DEGs is 2^{-19}, which we correct to 2.2^{-16}.
if(subgroup[[1]] == 12){
cluster.id <- expr.clust$clustering
png(here("Results", "for_paper", "deg_clusters.png"), width = 7, height = 3.5, units = "in", res = 300)
scaled.mat <- t(apply(mean.sig.expr[col.order,ordered.geno], 1, scale))
dimnames(scaled.mat) <- dimnames(mean.sig.expr[,ordered.geno])
layout.mat <- matrix(c(1,2), nrow = 1)
layout(layout.mat, widths = c(1, 0.2))
par(mar = c(2,4,4,1), xpd = NA)
imageWithText(t(scaled.mat), show.text = FALSE, use.pheatmap.colors = TRUE,
col.names = NULL, row.text.shift = 0.07, row.text.adj = 0.5)
plot.dim <- par("usr")
plot.height <- plot.dim[4] - plot.dim[3]
plot.width <- plot.dim[2] - plot.dim[1]
c1.idx <- which(cluster.id[col.order] == 1)
c2.idx <- which(cluster.id[col.order] == 2)
c1.mid <- mean(c1.idx)
c2.mid <- mean(c2.idx)
label.y <- (plot.dim[4]+(plot.height*0.09))
text(x = c1.mid, y = label.y, labels = "Mitochondria and Ribosome")
text(x = c2.mid, y = label.y, labels = "Neuron and Synapse")
#add colored labels for the genotypes
y.vals <- 1:length(ordered.geno)
y.shift <- plot.height*0.06
label.xmin <- plot.dim[1] - (plot.width*0.09)
label.xmax <- plot.dim[1] + (plot.width*0.03)
for(i in 1:length(y.vals)){
draw.rectangle(label.xmin, label.xmax, (y.vals[i] - y.shift), (y.vals[i] + y.shift),
border = geno.cols[match(rev(ordered.geno)[i], names(geno.cols))],
lwd = 3)
}
#add markes for AD genes
ad.id <- gene.info[match(ad.orthos, gene.info[,"external_gene_name"]), "ensembl_gene_id"]
to.mark <- intersect(ad.id[which(!is.na(ad.id))], rownames(scaled.mat))
ad.idx <- match(to.mark, rownames(scaled.mat))
text(x = ad.idx, y = 0.35, labels = "|", col = circle.cols["ad"])
text(x = -65, y = 0.35, labels = "AD gene", adj = 1)
c1.ad.perc <- round(length(intersect(ad.idx, c1.idx))/length(c1.idx)*100)
c2.ad.perc <- round(length(intersect(ad.idx, c2.idx))/length(c2.idx)*100)
text(x = c1.mid, y = 0, labels = paste0(c1.ad.perc, "% AD"))
text(x = c2.mid, y = 0, labels = paste0(c2.ad.perc, "% AD"))
#add scale bar
par(mar = c(2,4,4,1))
imageWithTextColorbar(mean.sig.expr, use.pheatmap.colors = TRUE, cex = 1, bar.lwd = 3)
dev.off()
}
## quartz_off_screen
## 2
sig.info <- gene.info[match(rownames(mean.sig.expr), gene.info[,"ensembl_gene_id"]),]
expr.table <- mean.sig.expr
colnames(expr.table) <- paste0(colnames(expr.table), "_mean_expression")
is.agora <- rep("no", nrow(sig.info))
is.agora[which(sig.info[,"external_gene_name"] %in% ad.orthos)] <- "yes"
supp.table <- cbind(sig.info, expr.table, "Agora_target" = is.agora,
"Cluster_ID" = cluster.id[rownames(mean.sig.expr)])
write.table(supp.table, here("Results", "for_paper", "Supp_Table_DEG.csv"),
sep = ",", quote = FALSE, row.names = FALSE)
There were multiple individual genes of interest in this list:
APOE was upregulated in the VS animals reltaive to WT and FC animals. APOE mRNA has been shown to be more abundant in AD brains (PMID: 7751846, 27104063), so this possibly contradicts the protective effect of the VS allele.
Celf1 is also low in the VS animals relative to the FC animals. It has been previously shown that CELF1 expression is low in AD brains (mentioned in PMID: 38768546), so this also possibly contradicts the protective effect of the VS allele.
Trem2 is downregulated in the FC animals relative to the others. This could be protective since Trem2 is more highly expressed in AD brains and may be associated with increased macrophage recruitment (PMID: 33516818).
So we see a mixed bag with expression of individual genes. Thus far, the VS mice do not look as if their brains have less AD-like expression.
ad.interest <- c("App", "Apoe", "Celf1", "Trem2")
for(i in 1:length(ad.interest)){
cat("###", ad.interest[i], "\n")
gene.id <- gene.info[which(gene.info[,"external_gene_name" ]== ad.interest[i]), "ensembl_gene_id"]
test <- plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat,
tx_name = gene.id, tx_label = ad.interest[i], ylab = "Expression (A.U.)",
order.by.mean = FALSE, plot.results = TRUE)
cat("\n\n")
}
if(subgroup[[1]] == 12){
pdf(here("Results", "for_paper", "AD-related_expression.pdf"), width = 7, height = 7)
for(i in 1:length(ad.interest)){
gene.id <- gene.info[which(gene.info[,"external_gene_name" ]== ad.interest[i]), "ensembl_gene_id"]
test <- plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat,
tx_name = gene.id, tx_label = ad.interest[i], ylab = "Expression (A.U.)",
order.by.mean = FALSE, plot.results = TRUE, cex.labels = 1.5)
}
dev.off()
}
## quartz_off_screen
## 2
We also looked at differential expression based on the biodomains described in PMID: 38650747. The lists are stored at syn25760039. The synIDs for the individual files are listed in Synapse_IDs_biodomain_gene_lists.csv.
There is also a table stored at syn25428992, that annotates the GO terms. Each row in this table is a GO term. The information in each row tells you which biodomain and which subdomain the GO term is included in. This is how Greg makes those boxplots in which each dot is a GO term, and the position is the normalized enrichment score from GSEA. The dots are grouped by domain. I am not going to worry about the subdomains right now, but it will be handy to have the lists of genes in each GO term.
For now, I will make a list of genes in each domain, translate those to mouse orthologs and test for enrichment among differentially expressed genes.
biodomain.dir <- here("Data", "Biodomains")
if(!file.exists(biodomain.dir)){dir.create(biodomain.dir)}
#make a list of the genes in each biodomain
file.dest <- file.path(biodomain.dir, "annotated_biodomains_Oct23.rds")
if(!file.exists(file.dest)){
synGet(syn25428992, downloadLocation = biodomain.dir)
}
annotated_bd <- readRDS(file.path(biodomain.dir, "annotated_biodomains_Oct23.rds"))
u_bd <- unique(annotated_bd$Biodomain)
bd_genes <- vector(mode = "list", length = length(u_bd))
names(bd_genes) <- u_bd
for(bd in 1:length(u_bd)){
bd.idx <- which(annotated_bd$Biodomain == u_bd[bd])
bd_genes[[bd]] <- Reduce("union", annotated_bd$ensembl_id[bd.idx])
}
saveRDS(bd_genes, file.path(general.processed.data.dir, "Human_Biodomains_for_GSEA.RDS"))
#also gather subdomains in a nested list
sbd_genes <- vector(mode = "list", length = length(u_bd))
names(sbd_genes) <- u_bd
for(bd in 1:length(u_bd)){
bd.idx <- which(annotated_bd$Biodomain == u_bd[bd])
sbd.names <- unique(annotated_bd$Subdomain[bd.idx])
sbd.names <- sbd.names[which(!is.na(sbd.names))]
if(length(sbd.names) > 0){
sbd_list <- vector(mode = "list", length = length(sbd.names))
names(sbd_list) <- sbd.names
for(sbd in 1:length(sbd.names)){
sbd.idx <- which(annotated_bd$Subdomain == sbd.names[sbd])
sbd_list[[sbd]] <- Reduce("union", annotated_bd$ensembl_id[sbd.idx])
}
sbd_genes[[bd]] <- sbd_list
}
}
saveRDS(sbd_genes, file.path(general.processed.data.dir, "Human_Subdomains_for_GSEA.RDS"))
The biodomains are in terms of human genes. We need to translate these to mouse orthologs.
mouse.bd.genes <- vector(mode = "list", length = length(bd_genes))
names(mouse.bd.genes) <- names(bd_genes)
for(i in 1:length(bd_genes)){
bd.with.ortho <- intersect(bd_genes[[i]], orthos[,"Human.Ensembl"])
bd.idx <- match(bd.with.ortho, orthos[,"Human.Ensembl"])
mouse.bd.genes[[i]] <- orthos[bd.idx,c("Mouse.Ortholog.Ensembl", "Mouse.Ortholog.Name")]
}
bd.for.gsea <- lapply(mouse.bd.genes, function(x) x[,1])
saveRDS(bd.for.gsea, file.path(general.processed.data.dir, "Mouse_Biodomains_for_GSEA.RDS"))
#do the same for subdomains
mouse.sbd.genes <- vector(mode = "list", length = length(sbd_genes))
names(mouse.sbd.genes) <- names(sbd_genes)
for(i in 1:length(sbd_genes)){
if(length(sbd_genes[[i]]) > 0){
sbd.list <- vector(mode = "list", length = length(sbd_genes[[i]]))
names(sbd.list) <- names(sbd_genes[[i]])
for(s in 1:length(sbd.list)){
sbd.with.ortho <- intersect(sbd_genes[[i]][[s]], orthos[,"Human.Ensembl"])
sbd.idx <- match(sbd.with.ortho, orthos[,"Human.Ensembl"])
sbd.list[[s]] <- orthos[sbd.idx,c("Mouse.Ortholog.Ensembl", "Mouse.Ortholog.Name")]
}
mouse.sbd.genes[[i]] <- sbd.list
}
}
sbd.for.gsea <- lapply(mouse.sbd.genes, function(x) lapply(x, function(y) y[,1]))
saveRDS(sbd.for.gsea, file.path(general.processed.data.dir, "Mouse_Subdomains_for_GSEA.RDS"))
#also save gene lists for all GO terms in each biodomain
#This is how Greg Cary makes his enrichment boxplots. He
#gets a normalized enrichment score for each GO term and plots
#all those together for each biodomain
#take any GO term with at least 3 genes
mouse_go_list <- human_go_list <- vector(mode = "list", length = length(bd_genes))
names(mouse_go_list) <- names(human_go_list) <- names(bd_genes)
for(bd in 1:length(bd_genes)){
bd.idx <- which(annotated_bd$Biodomain == names(bd_genes)[bd])
bd_go <- annotated_bd$GOterm_Name[bd.idx]
go_genes <- annotated_bd$ensembl_id[bd.idx]
names(go_genes) <- bd_go
num.genes <- sapply(go_genes, length)
human_go_genes <- go_genes[which(num.genes >= min.term.size)]
dups <- which(duplicated(names(human_go_genes)))
dup.idx <- lapply(dups, function(x) which(names(human_go_genes) == names(human_go_genes)[x]))
keep.idx <- unique(sapply(dup.idx, function(x) x[1]))
delete.idx <- setdiff(unique(unlist(dup.idx)), keep.idx)
no.dups <- setdiff(1:length(human_go_genes), delete.idx)
human_go_list[[bd]] <- human_go_genes[no.dups]
common_go_genes <- lapply(human_go_genes[no.dups], function(x) intersect(x, orthos[,"Human.Ensembl"]))
mouse_go_genes <- lapply(common_go_genes, function(x) orthos[match(x, orthos[,"Human.Ensembl"]),"Mouse.Ortholog.Ensembl"])
num.mouse <- sapply(mouse_go_genes, length)
mouse_go_list[[bd]] <- mouse_go_genes[which(num.mouse >= min.term.size)]
}
saveRDS(mouse_go_list, file.path(general.processed.data.dir, "Mouse_Biodomains_sub_GO_for_GSEA.RDS"))
saveRDS(human_go_list, file.path(general.processed.data.dir, "Human_Biodomains_sub_GO_for_GSEA.RDS"))
The following heat maps show the mean expression of genes across each biodomain. Each has been clustered by medoid clustering into the best number of clusters (always 2). The two variants seem to have opposing expression relative to the WT. This seems like an artefactual pattern. I’m not sure why it is so consistent.
mean.bd.vals <- vector(mode = "list", length = length(mouse.bd.genes))
names(mean.bd.vals) <- names(mouse.bd.genes)
for(bd in 1:length(mouse.bd.genes)){
bd.vals <- lapply(1:nrow(mouse.bd.genes[[bd]]),
function(x) plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat,
tx_name = mouse.bd.genes[[bd]][x,1],
tx_label = mouse.bd.genes[[bd]][x,2],
ylab = "Exression (A.U.)", plot.results = FALSE)[[1]])
has.vals <- which(sapply(bd.vals, length) > 0)
bd.mean <- t(sapply(bd.vals[has.vals], function(x) sapply(x[[1]], mean)))
rownames(bd.mean) <- sapply(bd.vals[has.vals], names)
mean.bd.vals[[bd]] <- bd.mean
}
#In previous runs I tried 2 to 10 clusters. All ended up with 2,
#so I am just setting k = 2 here to save time, but leaving the
#code for testing for future work.
clustered.gene.file <- file.path(results.dir, "Clustered_Coef.RDS")
clustered.enrich.file <- file.path(results.dir, "Clustered_Enrich.RDS")
if(!file.exists(clustered.gene.file)){
#k.seq = 2:4
clustered.genes <- gene.enrich <- vector(mode = "list", length = length(mean.bd.vals))
names(clustered.genes) <- names(gene.enrich) <- names(mean.bd.vals)
for(i in 1:length(mean.bd.vals)){
#test.k <- test.pam.k(mean.bd.vals[[i]], plot.results = FALSE, kseq = k.seq)
#mean.cl.width <- sapply(test.k$cl.width, mean)
#barplot(mean.cl.width)
#select the number of clusters with the best separation among clusters.
#k = as.numeric(names(mean.cl.width)[which.max(mean.cl.width)])
k = 2
final.clust <- pam(mean.bd.vals[[i]], k = k)
clustered.genes[[i]] <- final.clust$clustering
cluster.enrich <- lapply(1:k, function(x) gost(names(clustered.genes[[i]])[which(clustered.genes[[i]] == x)],
organism = "mmusculus", source = c("GO", "KEGG", "REACTOME", "HP", "CORUM")))
names(cluster.enrich) <- paste0("cluster", 1:k)
gene.enrich[[i]] <- cluster.enrich
#plot.enrichment.group(cluster.enrich, max.term.size = 3000, n.terms = 30, cluster_cols = FALSE, plot.label = names(mean.bd.vals)[i])
}
saveRDS(clustered.genes, clustered.gene.file)
saveRDS(gene.enrich, clustered.enrich.file)
}else{
clustered.genes <- readRDS(clustered.gene.file)
gene.enrich <- readRDS(clustered.enrich.file)
}
bd.means <- matrix(nrow = length(mean.bd.vals), ncol = 5)
rownames(bd.means) <- names(mean.bd.vals)
colnames(bd.means) <- genotypes[1:5]
#pdf("~/Desktop/test.pdf", width = 8, height = 4)
for(i in 1:length(mean.bd.vals)){
cat("###", names(mean.bd.vals)[i], "\n")
#pheatmap(t(mean.bd.vals[[i]][names(clustered.genes[[i]])[order(clustered.genes[[i]])],]),
# cluster_rows = FALSE, cluster_cols = FALSE, show_colnames = FALSE, main = names(mean.bd.vals)[i])
par(mfrow = c(1,2))
par(mar = c(4,4,4,0))
clustered.mat <- t(mean.bd.vals[[i]][names(clustered.genes[[i]])[order(clustered.genes[[i]])],])
cluster.means <- rowMeans(clustered.mat)
mean.order <- order(cluster.means)
bd.means[i,] <- cluster.means
imageWithText(clustered.mat[mean.order,],
show.text = FALSE, use.pheatmap.colors = TRUE, col.names = NULL)
par(mar = c(4,4,4,2))
#p values seem to depend only on how many genes are in the group
diff.test <- aov.by.matrix(t(clustered.mat))
diff.p <- summary(diff.test)[[1]]$"Pr(>F)"[1]
vioplot(t(clustered.mat[rev(mean.order),]),
main = "", horizontal = TRUE,
las = 2, col = geno.cols[rownames(clustered.mat)[rev(mean.order)]])
segments(x0 = 0, y0 = 0, y1 = 5.7)
mtext(names(mean.bd.vals)[i], side = 3, outer = TRUE, font = 2, line = -2.5)
cat("\n\n")
}
#dev.off()
The following plot shows a summary of the mean expression of the biodomains in each genotype.
bd.mean.decomp <- plot.decomp(t(bd.means), plot.results = FALSE)
row.order <- order(bd.mean.decomp$v[,1])
layout.mat <- matrix(c(1,2), nrow = 1)
layout(layout.mat, widths = c(1, 0.5))
par(mar = c(4,18, 2,1))
imageWithText(signif(bd.means[row.order,], 2), split.at.vals = TRUE,
col.scale = c("blue", "brown"), grad.dir = "ends", row.text.shift = 0.15,
col.text.shift = 0.03)
par(mar = c(4,1,2,2))
barplot(bd.mean.decomp$v[row.order,1], horiz = TRUE, xlab = "PC1")
We did the same analysis above for the subdomains.
The following heat maps show the mean expression of genes across each subdomain. Each has been clustered by medoid clustering.
sbd.mean.file <- file.path(results.dir, "Subdomain_Mean_Expr.RDS")
if(!file.exists(sbd.mean.file)){
mean.sbd.vals <- vector(mode = "list", length = length(mouse.sbd.genes))
names(mean.sbd.vals) <- names(mouse.sbd.genes)
for(bd in 1:length(mouse.sbd.genes)){
if(length(mouse.sbd.genes[[bd]]) > 0){
sub.means <- vector(mode = "list", length = length(mouse.sbd.genes[[bd]]))
names(sub.means) <- names(mouse.sbd.genes[[bd]])
for(s in 1:length(mouse.sbd.genes[[bd]])){
sbd <- mouse.sbd.genes[[bd]][[s]]
sbd.vals <- lapply(1:nrow(sbd),
function(x) plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat,
tx_name = sbd[x,1], tx_label = sbd[x,2],
ylab = "Expression (A.U.)", plot.results = FALSE)[[1]])
has.vals <- which(sapply(sbd.vals, length) > 0)
sbd.mean <- t(sapply(sbd.vals[has.vals], function(x) sapply(x[[1]], mean)))
rownames(sbd.mean) <- sapply(sbd.vals[has.vals], names)
sub.means[[s]] <- sbd.mean
}
}
mean.sbd.vals[[bd]] <- sub.means
}
saveRDS(mean.sbd.vals, sbd.mean.file)
}else{
mean.sbd.vals <- readRDS(sbd.mean.file)
}
clustered.sd.file <- file.path(results.dir, "Clustered_Coef_SBD.RDS")
#clustered.sd.enrich.file <- file.path(results.dir, "Clustered_Enrich_SBD.RDS")
if(!file.exists(clustered.sd.file)){
clustered.sd.genes <- vector(mode = "list", length = length(mean.sbd.vals))
names(clustered.sd.genes) <- names(mean.sbd.vals)
for(bd in 1:length(mean.sbd.vals)){
if(length(mean.sbd.vals[[bd]]) > 0){
clustered.sd <- vector(mode = "list", length = length(mean.sbd.vals[[bd]]))
names(clustered.sd) <- names(mean.sbd.vals[[bd]])
for(s in 1:length(mean.sbd.vals[[bd]])){
k = 2
if(length(mean.sbd.vals[[bd]][[s]]) > 0){
final.clust <- pam(mean.sbd.vals[[bd]][[s]], k = k)
clustered.sd[[s]] <- final.clust$clustering
#cluster.enrich <- lapply(1:k, function(x) gost(names(clustered.genes[[i]])[which(clustered.genes[[i]] == x)],
# organism = "mmusculus", source = c("GO", "KEGG", "REACTOME", "HP", "CORUM")))
#names(cluster.enrich) <- paste0("cluster", 1:k)
#gene.enrich[[i]] <- cluster.enrich
#plot.enrichment.group(cluster.enrich, max.term.size = 3000, n.terms = 30, cluster_cols = FALSE, plot.label = names(mean.bd.vals)[i])
}
}
clustered.sd.genes[[bd]] <- clustered.sd
}
}
saveRDS(clustered.sd.genes, clustered.sd.file)
#saveRDS(gene.enrich, clustered.enrich.file)
}else{
clustered.sd.genes <- readRDS(clustered.sd.file)
#gene.enrich <- readRDS(clustered.enrich.file)
}
num.sbd <- sum(sapply(mean.sbd.vals, length))
sbd.means <- matrix(nrow = num.sbd, ncol = 5)
sbd.names <- vector(mode = "list", length = length(mean.sbd.vals))
for(i in 1:length(mean.sbd.vals)){
sbd.names[[i]] <- paste(names(mean.sbd.vals)[i], names(mean.sbd.vals[[i]]), sep = " : ")
}
rownames(sbd.means) <- unlist(sbd.names)
colnames(sbd.means) <- genotypes[1:5]
#pdf("~/Desktop/test_sd.pdf", width = 8, height = 4)
idx <- 1
for(bd in 1:length(mean.sbd.vals)){
cat("###", names(mean.sbd.vals[bd]), "{.tabset .tabset-fade .tabset-pills}\n")
if(length(mean.sbd.vals[[bd]]) > 0){
for(s in 1:length(mean.sbd.vals[[bd]])){
cat("####", names(mean.sbd.vals[[bd]][s]), "\n")
#pheatmap(t(mean.bd.vals[[i]][names(clustered.genes[[i]])[order(clustered.genes[[i]])],]),
# cluster_rows = FALSE, cluster_cols = FALSE, show_colnames = FALSE, main = names(mean.bd.vals)[i])
par(mfrow = c(1,2))
par(mar = c(4,4,4,0))
clustered.mat <- t(mean.sbd.vals[[bd]][[s]][names(clustered.sd.genes[[bd]][[s]])[order(clustered.sd.genes[[bd]][[s]])],])
cluster.means <- rowMeans(clustered.mat)
mean.order <- order(cluster.means)
sbd.means[idx,] <- cluster.means
imageWithText(clustered.mat[mean.order,],
show.text = FALSE, use.pheatmap.colors = TRUE, col.names = NULL)
par(mar = c(4,4,4,2))
#p values seem to depend only on how many genes are in the group
diff.test <- aov.by.matrix(t(clustered.mat))
diff.p <- summary(diff.test)[[1]]$"Pr(>F)"[1]
vioplot(t(clustered.mat[rev(mean.order),]),
main = "", horizontal = TRUE,
las = 2, col = geno.cols[rownames(clustered.mat)[rev(mean.order)]])
segments(x0 = 0, y0 = 0, y1 = 5.7)
mtext(rownames(sbd.means)[idx], side = 3, outer = TRUE, font = 2, line = -2.5)
cat("\n\n")
idx <- idx + 1
cat("\n\n")
}
}
cat("\n\n")
}
#dev.off()
The following plot shows a summary of the mean expression of the subdomains in each genotype. Because there are so many subdomains, we subset to only those with a |PC1| value greater than 0.5.
sbd.decomp <- plot.decomp(sbd.means, plot.results = FALSE)
big.idx <- which(abs(sbd.decomp$u[,1]) > 0.1)
row.order <- order(sbd.decomp$u[big.idx,1])
#png("~/Desktop/sbd_overview.png", width = 15, height = 8, units = "in", res = 300)
layout.mat <- matrix(c(1,2), nrow = 1)
layout(layout.mat, widths = c(1, 0.5))
par(mar = c(4,35, 2,1))
imageWithText(signif(sbd.means[big.idx[row.order],], 2), split.at.vals = TRUE,
col.scale = c("blue", "brown"), grad.dir = "ends", row.text.shift = 0.15,
col.text.shift = 0.05)
par(mar = c(4,1,2,2))
barplot(sbd.decomp$u[big.idx[row.order],1], horiz = TRUE, xlab = "PC1")
plot.dim <- par("usr")
segments(x0 = c(-0.1, 0, 0.1), y0 = 0, y1 = plot.dim[4], lty = c(2,1,2))
#dev.off()
KEGG pathways are AD-agnostic and are organized into different functional groups that could be informative. We also looked at gene expression based on KEGG pathways.
mouse.kegg.file <- file.path(general.data.dir, "KEGG.Mouse.RDS")
if(!file.exists(mouse.kegg.file)){
all.mouse.kegg <- download_KEGG("mmu", "KEGG", "kegg")
saveRDS(all.mouse.kegg, mouse.kegg.file)
}else{
all.mouse.kegg <- readRDS(mouse.kegg.file)
}
human.kegg.file <- file.path(general.data.dir, "KEGG.Human.RDS")
if(!file.exists(human.kegg.file)){
all.human.kegg <- download_KEGG("hsa", "KEGG", "kegg")
saveRDS(all.human.kegg, human.kegg.file)
}else{
all.human.kegg <- readRDS(human.kegg.file)
}
#convert mouse entrez IDs to ensembl IDs
u_path <- gsub(" - Mus musculus (house mouse)", "", all.mouse.kegg[[2]][,"to"], fixed = TRUE)
path.id <- all.mouse.kegg[[2]][,1]
path.idx <- lapply(path.id, function(x) which(all.mouse.kegg[[1]][,1] == unlist(x)[1]))
path.gene.id <- lapply(path.idx, function(x) all.mouse.kegg[[1]][x,2])
names(path.gene.id) <- u_path
mouse.path.gene.ensembl <- lapply(path.gene.id,
function(x) mouse.gene.table[match(x, mouse.gene.table[,"entrezgene_id"]), "ensembl_gene_id"])
saveRDS(mouse.path.gene.ensembl, file.path(general.processed.data.dir, "Mouse_KEGG_for_GSEA.RDS"))
#convert human entrez IDs to ensembl IDs
u_path <- all.human.kegg[[2]][,"to"]
path.id <- all.human.kegg[[2]][,1]
path.idx <- lapply(path.id, function(x) which(all.human.kegg[[1]][,1] == unlist(x)[1]))
path.gene.id <- lapply(path.idx, function(x) all.human.kegg[[1]][x,2])
names(path.gene.id) <- u_path
#use the ortholog table to convert. We only care about genes
#that have mouse orthologs anyway (I think).
hum.path.gene.ensembl <- lapply(path.gene.id,
function(x) orthos[match(x, orthos[,"Human.Entrez"]), "Human.Ensembl"])
saveRDS(hum.path.gene.ensembl, file.path(general.processed.data.dir, "Human_KEGG_for_GSEA.RDS"))
For each mouse KEGG pathway, is there differential expression across genotypes? The results are printed to pdfs in the results folder. There are too many KEGG pathways to display here.
mean.kegg.vals <- vector(mode = "list", length = length(mouse.path.gene.ensembl))
names(mean.kegg.vals) <- names(mouse.path.gene.ensembl)
for(k in 1:length(mouse.path.gene.ensembl)){
k.vals <- lapply(1:length(mouse.path.gene.ensembl[[k]]),
function(x) plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat,
tx_name = mouse.path.gene.ensembl[[k]][x],
tx_label = NA, ylab = "Expression (A.U.)", plot.results = FALSE)[[1]])
has.vals <- which(sapply(k.vals, length) > 0)
k.mean <- t(sapply(k.vals[has.vals], function(x) sapply(x[[1]], mean)))
#pheatmap(k.mean)
mean.kegg.vals[[k]] <- k.mean
}
test.diff <- lapply(mean.kegg.vals, aov.by.matrix)
test.p <- sapply(test.diff, function(x) x$"Pr(>F)"[1])
#qqunif.plot(test.p)
mean.mean <- t(sapply(mean.kegg.vals, colMeans))
#pdf(file.path(results.dir, "kegg.pdf"), width = 7, height = 42)
#pheatmap(mean.mean, cluster_cols = FALSE)
#dev.off()
term.name = "Virion - Rotavirus"
term.name = "Ribosome"
term.name = "Biotin metabolism"
term.idx <- which(names(mean.kegg.vals) == term.name)
#boxplot(mean.kegg.vals[[term.idx]], main = term.name, ylab = "Scaled Expression")
#pheatmap(mean.kegg.vals[[term.idx]], main = term.name, show_rownames = FALSE, cluster_cols = FALSE)
#mean.kegg.vals[[term.idx]]
Matt said he read a paper that suggested that a useful unit of inquiry was the intersections between GO terms and KEGG terms. These intersections carry both functional information and pathway information.
Here we take any intersection with at least 3 genes
intersect_bd_kegg <- function(bd.list, kegg.list){
intersect.list <- vector(mode = "list", length = length(bd.list))
names(intersect.list) <- names(bd.list)
for(bd in 1:length(bd.list)){
k.intersections <- vector(mode = "list", length = length(kegg.list))
names(k.intersections) <- names(kegg.list)
for(k in 1:length(kegg.list)){
k.intersections[[k]] <- intersect(bd.list[[bd]], kegg.list[[k]])
}
has.genes <- which(sapply(k.intersections, length) >= min.term.size)
intersect.list[[bd]] <- k.intersections[has.genes]
}
return(intersect.list)
}
#save grouped form, so we can do the boxplot thingy that Greg Cary does.
mouse.bd.k.intersections <- intersect_bd_kegg(bd.list = bd.for.gsea, kegg.list = mouse.path.gene.ensembl)
saveRDS(mouse.bd.k.intersections, file.path(general.processed.data.dir, "Mouse_KEGG_Intersections_for_GSEA.RDS"))
human.bd.k.intersections <- intersect_bd_kegg(bd_genes, hum.path.gene.ensembl)
saveRDS(human.bd.k.intersections, file.path(general.processed.data.dir, "Human_KEGG_Intersections_for_GSEA.RDS"))
mean.bd.k.file <- file.path(results.dir, "KEGG_BD_mean_expression.RDS")
if(!file.exists(mean.bd.k.file)){
mean.bd.k.vals <- vector(mode = "list", length = length(mouse.path.gene.ensembl))
names(mean.bd.k.vals) <- names(mouse.path.gene.ensembl)
#for each kegg pathway
for(k in 1:length(mouse.path.gene.ensembl)){
kegg.path <- names(mouse.path.gene.ensembl)[k]
has.intersection <- which(sapply(lapply(mouse.bd.k.intersections, function(x) which(names(x) == kegg.path)), length) > 0)
if(length(has.intersection) == 0){next()}
intersect.mean.vals <- vector(mode = "list", length = length(has.intersection))
names(intersect.mean.vals) <- names(has.intersection)
all.val.mat <- matrix(NA, nrow = length(mouse.bd.genes), ncol = 5)
rownames(all.val.mat) <- names(mouse.bd.genes)
colnames(all.val.mat) <- genotypes[1:5]
#pull out all gene values for intersections with biodomains
for(bd in 1:length(has.intersection)){
kegg.idx <- sapply(mouse.bd.k.intersections[has.intersection], function(x) which(names(x) == kegg.path))
intersect.ids <- mouse.bd.k.intersections[[has.intersection[bd]]][[kegg.idx[bd]]]
bd.k.vals <- lapply(intersect.ids,
function(x) plot_tx_with_genotype(expr.mat = scaled.expr, covar.table = covar.mat,
tx_name = x, tx_label = NA, ylab = "Expression (A.U.)", plot.results = FALSE)[[1]])
has.vals <- which(sapply(bd.k.vals, length) > 0)
k.mean <- t(sapply(bd.k.vals[has.vals], function(x) sapply(x[[1]], mean)))
#pheatmap(k.mean)
rownames(k.mean) <- intersect.ids[has.vals]
intersect.mean.vals[[bd]] <- k.mean
}
has.vals <- which(sapply(intersect.mean.vals, length) > 0)
mean.mat <- t(sapply(intersect.mean.vals[has.vals], colMeans))
all.val.mat[rownames(mean.mat), colnames(mean.mat)] <- mean.mat[rownames(mean.mat), colnames(mean.mat)]
mean.bd.k.vals[[k]] <- all.val.mat
#pheatmap(mean.bd.k.vals[[k]], main = names(path.gene.ensembl)[k], cluster_cols = FALSE, cluster_rows = FALSE)
}
saveRDS(mean.bd.k.vals, mean.bd.k.file)
}else{
mean.bd.k.vals <- readRDS(mean.bd.k.file)
}
has.vals <- which(sapply(mean.bd.k.vals, length) > 0)
min.val <- min(unlist(mean.bd.k.vals[has.vals]), na.rm = TRUE)
max.val <- max(unlist(mean.bd.k.vals[has.vals]), na.rm = TRUE)
#custom.dist <- unlist(mean.bd.k.vals[has.vals])
custom.dist <- rnorm(1000, 0, 0.2)
#hist(custom.dist)
kegg.plot.dir <- file.path(results.dir, "KEGG_plots")
if(!file.exists(kegg.plot.dir)){dir.create(kegg.plot.dir)}
pdf(file.path(kegg.plot.dir, "kegg_bd_intersections.pdf"), width = 7, height = 7)
hist(unlist(mean.bd.k.vals), xlab = "Mean Value", main = "Mean Biodomain-Kegg Intersection Expression")
for(i in 1:length(mean.bd.k.vals)){
if(length(mean.bd.k.vals[[i]]) > 0){
par(mar = c(4,12,2,2))
neg.vals <- length(which(mean.bd.k.vals[[i]] < 0))
pos.vals <- length(which(mean.bd.k.vals[[i]] > 0))
if(neg.vals > 0 && pos.vals > 0){
col.scale = c("blue", "brown")
grad.dir = "ends"
split.at.vals = TRUE
}
if(neg.vals > 0 && pos.vals == 0){
col.scale = "blue"
grad.dir = "low"
split.at.vals = TRUE
}
if(neg.vals == 0 && pos.vals > 0){
col.scale = "brown"
grad.dir = "high"
split.at.vals = TRUE
}
imageWithText(mat = mean.bd.k.vals[[i]], main = names(mean.bd.k.vals)[i],
col.scale = col.scale, grad.dir = grad.dir, col.text.rotation = 0,
col.text.adj = 0.5, col.text.shift = 0.06, row.text.shift = 0.15,
main.shift = 0.05, color.fun = "linear", split.at.vals = split.at.vals,
light.dark = "f", global.color.scale = TRUE, global.min = min.val,
global.max = max.val)
}
}
dev.off()
## quartz_off_screen
## 2
global.min <- min(unlist(mean.bd.k.vals), na.rm = TRUE)
global.max <- max(unlist(mean.bd.k.vals), na.rm = TRUE)
#plot all results for each biodomain on one page
for(bd in 1:length(bd_genes)){
bd.mat <- t(sapply(mean.bd.k.vals, function(x) if(length(x) > 0){x[bd,]}else{rep(NA, 5)}))
has.vals <- which(!is.na(rowMeans(bd.mat)))
#plot.decomp(t(bd.mat[has.vals,]), label.points = TRUE, main = names(bd_genes)[bd])
#bd.decomp <- plot.decomp(bd.mat[has.vals,], main = names(bd_genes)[bd])
#plot.decomp(bd.mat[has.vals,], main = names(bd_genes)[bd], label.points = TRUE)
if(length(has.vals) > 0){
png(file.path(kegg.plot.dir, paste0("bd_kegg_intersections_", names(bd_genes)[bd], ".png")),
width = 7, height = max(c(length(has.vals)/7, 7)), units = "in", res = 200)
row.order <- hclust(dist(bd.mat[has.vals,]))$order
par(mar = c(4,16, 2, 2))
test <- imageWithText(mat = bd.mat[has.vals[row.order],], main = names(bd_genes)[bd],
col.scale = c("blue", "brown"), grad.dir = "ends", col.text.rotation = 0,
col.text.adj = 0.5, col.text.shift = 0.01, row.text.shift = 0.15,
main.shift = 0.01, color.fun = "linear", split.at.vals = TRUE,
color.dist = NULL, light.dark = "f", global.color.scale = TRUE,
global.min = global.min, global.max = global.max)
dev.off()
}
}
We put all the results together to see if there were any combinations that stood out overall. The plot below shows that the expression across all KEGG pathway and biodomain intersections separates the genotypes nicely. Along the first principal component, the VS genotypes are negative, the WT mice are at 0, and the FC genotypes are positive.
big.mat <- Reduce("rbind", mean.bd.k.vals)
group.names <- unlist(lapply(1:length(mean.bd.k.vals), function(x) if(length(mean.bd.k.vals[[x]] > 0)){paste(names(mean.bd.k.vals)[x], rownames(mean.bd.k.vals[[x]]), sep = "--")}))
rownames(big.mat) <- group.names
#pdf("~/Desktop/big.mat.pdf", height = 20, width = 10)
#pheatmap(big.mat, cluster_rows = FALSE, cluster_cols = FALSE)
#dev.off()
not.na <- which(!is.na(rowSums(big.mat)))
sub.mat <- big.mat[not.na,]
process.decomp <- plot.decomp(t(sub.mat), label.points = TRUE, xlim = c(-1, 1),
cols = geno.cols[match(names(geno.cols), colnames(sub.mat))], cex = 1.5)
process.vals <- process.decomp$v
pc.order <- order(process.decomp$u[,1])
barplot(process.decomp$u[pc.order,1],
col = geno.cols[match(names(geno.cols), colnames(sub.mat))][pc.order],
names = colnames(sub.mat)[pc.order], ylab = "PC1")
abline(h = 0)
#process.decomp <- plot.decomp(big.mat[has.vals,], label.points = TRUE)
#plot(process.decomp$v, type = "n")
#text(process.decomp$v[,1], process.decomp$v[,2], labels = colnames(big.mat))
#process.vals <- process.decomp$u
The processes don’t have any particular clustering as shown below. But there are some pathways that have large negative and positive values in the first PC.
plot(process.decomp$v, xlab = "PC1", ylab = "PC2", pch = 16)
large.neg <- which(process.decomp$v[,1] < -0.04)
large.pos <- which(process.decomp$v[,1] > 0.05)
text(process.decomp$v[large.pos,1], process.decomp$v[large.pos,2], labels = rownames(sub.mat)[large.pos], cex = 0.5)
text(process.decomp$v[large.neg,1], process.decomp$v[large.neg,2], labels = rownames(sub.mat)[large.neg], cex = 0.5)
The plots below show the mean expression for each of these processes with large PC values.
pc.order <- order(process.vals[,1], decreasing = TRUE)
most.neg <- tail(pc.order, 20)
most.pos <- head(pc.order, 20)
#most.pos <- pc.order[1620:1630] #check middle
#test.mat <- process.decomp$v[most.neg,1,drop=FALSE]
#rownames(test.mat) <- rownames(sub.mat)[most.neg]
#test.mat[order(test.mat[,1]),]
par(mar = c(4, 24, 2, 2))
layout(matrix(c(1,2), ncol = 2), widths = c(1, 0.5))
imageWithText(sub.mat[rev(most.neg),], col.scale = c("blue", "brown"), split.at.vals = TRUE,
grad.dir = "ends", col.text.rotation = 0, col.text.adj = 0.5, col.text.shift = 0.05, row.text.cex = 0.7,
row.text.shift = 0.15, main = "Large Negative PC pathways")
par(mar = c(3.8, 0, 1.8, 2))
barplot(process.vals[most.neg,1], horiz = TRUE)
mtext("PC1", side = 1, line = 2.2)
#test.mat <- process.decomp$v[most.pos,1,drop=FALSE]
#rownames(test.mat) <- rownames(sub.mat)[most.pos]
#test.mat[order(test.mat[,1]),]
par(mar = c(4, 18, 2, 0))
#row.order <- hclust(dist(sub.mat[most.pos,]))$order
layout(matrix(c(1,2), ncol = 2), widths = c(1, 0.5))
imageWithText(sub.mat[most.pos,], col.scale = c("blue", "brown"), split.at.vals = TRUE,
grad.dir = "ends", col.text.rotation = 0, col.text.adj = 0.5, col.text.shift = 0.05, row.text.cex = 0.7,
row.text.shift = 0.15, main = "Large Positive PC pathways")
par(mar = c(3.8, 0, 1.8, 2))
barplot(process.vals[rev(most.pos),1], horiz = TRUE)
mtext("PC1", side = 1, line = 2.2)
Can we build a biodomain network that shows us where the biggest differences are? The biodomains were designed such
#This is quite derived, but let's see what happens when we collect
#the loadings on PC1 for each kegg-biodomain pair.
umat <- matrix(0, nrow = length(mouse.path.gene.ensembl), ncol = length(bd_genes))
rownames(umat) <- names(mouse.path.gene.ensembl)
colnames(umat) <- names(bd_genes)
split.names <- strsplit(rownames(sub.mat), "--")
split.kegg <- sapply(split.names, function(x) x[1])
split.bd <- sapply(split.names, function(x) x[2])
for(i in 1:nrow(process.vals)){
umat[split.kegg[i], split.bd[i]] <- process.vals[i,1]
}
#pheatmap(umat)
no.vals <- which(rowSums(umat) == 0)
has.vals <- which(rowSums(umat) != 0)
row.order <- order(rowSums(umat[has.vals,]))
col.order <- order(colSums(umat[has.vals,]))
pdf(file.path(kegg.plot.dir, "KEGG_Umat.pdf"), width = 9, height = 40)
pheatmap(umat[has.vals[row.order],col.order], cluster_cols = TRUE, cluster_rows = TRUE)
dev.off()
## pdf
## 3
The following image shows just the rows with strong loadings for a given biodomain-KEGG intersection.
#filter to rows with the largest values
#hist(umat)
lower.thresh <- -0.04
upper.thresh <- 0.04
high.val <- which(apply(umat, 1, function(x) any(x > upper.thresh)))
low.val <- which(apply(umat, 1, function(x) any(x < lower.thresh)))
strong.vals <- union(high.val, low.val)
pheatmap(umat[strong.vals,])
#pdf("~/Desktop/sub_mat.pdf", width = 8, height = 9)
#pheatmap(umat[strong.vals,])
#dev.off()
#net <- graph_from_incidence_matrix(abs(umat[strong.vals,]))
#plot(net)
#pdf("~/Desktop/bip_plot.pdf", width = 10, height = 10)
#plot_bipartite(abs(umat[strong.vals,]))
#dev.off()
if(need.to.download){
synLogout()
}